##################### -  Environment - ##################### 
rm(list=objects())
setwd("~/Desktop/Experiments/Primary School/Data")

library(missSBM)
library(igraph)
library(softImpute)

##################### -  Data pre-processing - ##################### 
# load data for Day 1 and Day 2
data.1 <- read.graph("primary1.GML", format = "gml")
data.2 <- read.graph("primary2.GML", format = "gml")

# Adjacency matrices for both days
A.1 <- as.matrix(as_adjacency_matrix(data.1))
remove.edge.1 <- E(data.1)[get.edge.attribute(data.1, "duration")<= 60] # remove edge with duration <= 60 seconds
new.A.1 <- as.matrix(as_adjacency_matrix(data.1 - remove.edge.1))
A.1[new.A.1 - A.1 < 0] <- NA
diag(A.1) <- NA
A.1 <- A.1[order(V(data.1)$id), order(V(data.1)$id)]
colnames(A.1) <- sort(V(data.1)$id)
rownames(A.1) <- sort(V(data.1)$id)

A.2 <- as.matrix(as_adjacency_matrix(data.2))
remove.edge.2 <- E(data.2)[get.edge.attribute(data.2, "duration")<= 60] # remove edge with duration <= 60 seconds
new.A.2 <- as.matrix(as_adjacency_matrix(data.2 - remove.edge.2))
A.2[new.A.2 - A.2 < 0] <- NA
diag(A.2) <- NA
A.2 <- A.2[order(V(data.2)$id), order(V(data.2)$id)]
colnames(A.2) <- sort(V(data.2)$id)
rownames(A.2) <- sort(V(data.2)$id)

# Remove children that where absent either on Day 1 or on Day 2
absent.day.1 <- which(!(sort(V(data.1)$id) %in% V(data.2)$id))
absent.day.2 <- which(!(sort(V(data.2)$id) %in% V(data.1)$id))
A.1 <- A.1[-absent.day.1, -absent.day.1]
A.2 <- A.2[-absent.day.2, -absent.day.2]
N <- dim(A.1)[1]

# Statistics on the networks
sum(is.na(A.1))/N**2
sum(is.na(A.2))/N**2
sum(!is.na(A.2))/2
mean(colSums(A.1, na.rm = T)) #20
max(colSums(A.1, na.rm = T)) #41
min(colSums(A.1, na.rm = T)) #5

##################### -  Estimate the probabilities of connections - ##################### 
# Estimate network using missSBM
estimator_missSBM <- missSBM::estimateMissSBM(
  adjacencyMatrix = A.1, 
  vBlocks = 2:20,
  sampling = "dyad",
  control = list(trace = 0))$bestModel$fittedSBM
estimate_Theta_missSBM <- estimator_missSBM$expectation

# Estimate network using the variational estimate of z
z_est <- estimator_missSBM$memberships
K <- length(unique(z_est))
estimate_Q_Var <- sapply(1:K, function(a) sapply(1:K, function(b) mean(A.1[z_est == a, z_est == b], na.rm =T)))
estimate_Q_Var[is.na(estimate_Q_Var)] <- 0
estimate_Theta_Var <- sapply(1:N, function(i) sapply(1:N, function(j) estimate_Q_Var[z_est[i], z_est[j]]))
diag(estimate_Theta_Var) <- 0

# Estimate network using SVD
SVD <- softImpute(A.1, rank.max = K, lambda = 0)
estimate_Theta_softImpute <- SVD$u %*% diag(SVD$d, nrow = K, ncol = K) %*% t(SVD$v)
estimate_Theta_softImpute <- pmin(pmax(estimate_Theta_softImpute, 0),1)

# Estimate network using the observations of Day 1
estimator_Day_1 <- A.1
estimator_Day_1[is.na(estimator_Day_1)] <- mean(A.1, na.rm = T) # estimate unobserved edges with average connection probabilities
diag(estimator_Day_1) <- 0 # set the diagonal to 0

print(paste0("Normalised SE of Variational Estimator : ", round(sum((A.2 - estimate_Theta_Var)**2/sum(A.2, na.rm = T), na.rm = T),3)))
print(paste0("Normalised SE of missSBM : ", round(sum((A.2 - estimate_Theta_missSBM)**2/sum(A.2, na.rm = T), na.rm = T), 3)))
print(paste0("Normalised SE of SVD : ", round(sum((A.2 - estimate_Theta_softImpute)**2/sum(A.2, na.rm = T), na.rm = T), 3)))
print(paste0("Normalised SE of Naive Estimator : ", round(sum((A.2 - estimator_Day_1)**2/sum(A.2, na.rm = T), na.rm = T), 3)))
round(100*(1-sum((A.2 - estimate_Theta_Var)**2/sum(A.2, na.rm = T), na.rm = T)/sum((A.2 - estimate_Theta_softImpute)**2/sum(A.2, na.rm = T), na.rm = T)),1)


# RO Curves
thresholds <- c(-10,(-0:200)/200,10)

Recall_VAR <- rep(NA, 203)
Precision_VAR <- rep(NA, 203)

Recall_missSBM <- rep(NA, 203)
Precision_missSBM <- rep(NA, 203)

Recall_softImpute <- rep(NA, 203)
Precision_softImpute <- rep(NA, 203)

for (t in 1:203){
  
  # Using the variational estimator
  A_VAR <- 1*(estimate_Theta_Var>thresholds[t])
  TP <- sum((A_VAR == 1 & !is.na(A.2) & A.2 == 1))
  FP <- sum((A_VAR == 1 & !is.na(A.2) & A.2 == 0))
  TN <- sum((A_VAR == 0 & !is.na(A.2) & A.2 == 0))
  FN <- sum((A_VAR == 0 & !is.na(A.2) & A.2 == 1))
  Recall_VAR[t] <- TP/(TP+FN)
  Precision_VAR[t] <- TP/(TP+FP)
  Precision_VAR[is.na(Precision_VAR)] <- 1
  
  # Using missSBM
  A_missSBM <- 1*(estimate_Theta_missSBM>thresholds[t])
  TP <- sum((A_missSBM == 1 & !is.na(A.2) & A.2 == 1))
  FP <- sum((A_missSBM == 1 & !is.na(A.2) & A.2 == 0))
  TN <- sum((A_missSBM == 0 & !is.na(A.2) & A.2 == 0))
  FN <- sum((A_missSBM == 0 & !is.na(A.2) & A.2 == 1))
  Recall_missSBM[t] <- TP/(TP+FN)
  Precision_missSBM[t] <- TP/(TP+FP)
  Precision_missSBM[is.na(Precision_missSBM)] <- 1
  
  # Using softImpute
  A_softImpute <- 1*(estimate_Theta_softImpute>thresholds[t])
  TP <- sum((A_softImpute == 1 & !is.na(A.2) & A.2 == 1))
  FP <- sum((A_softImpute == 1 & !is.na(A.2) & A.2 == 0))
  TN <- sum((A_softImpute == 0 & !is.na(A.2) & A.2 == 0))
  FN <- sum((A_softImpute == 0 & !is.na(A.2) & A.2 == 1))
  Recall_softImpute[t] <- TP/(TP+FN)
  Precision_softImpute[t] <- TP/(TP+FP)
  Precision_softImpute[is.na(Precision_softImpute)] <- 1
}

plot(x = Recall_VAR, y = Precision_VAR, type = 'l', col = 'blue', 
     xlab = "Recall", ylab = "Precision", ylim = c(0,1), xlim = c(0,1))
lines(x = Recall_missSBM, y = Precision_missSBM, col = 'red')
lines(x = Recall_softImpute, y = Precision_softImpute, col = 'green')
lines(x = Recall_softImpute, y = rep(mean(A.2, na.rm = T), length(Recall_softImpute)), lty = 3, col = 'black')

